#-*- coding:utf-8 -*-
__author__ = 'cgd'


import feedparser
import jieba
import jieba.posseg
import jieba.analyse

# Remote RSS/Atom feed URLs that gethotword() downloads and analyses.
feed_list = [
    "http://36kr.com/feed",
    "http://www.tmtpost.com/rss.xml",
    "http://www.leiphone.com/feed",
    "http://www.geekpark.net/rss",
    "http://news.iheima.com/index.php?m=content&c=rss&a=mycreaterss",
    "http://www.huxiu.com/rss/0.xml",
    "http://n.rss.qq.com/rss/tech_rss.php",
    "http://www.ifanr.com/feed",
]

# Paths to feed documents stored under ./test_data.
# NOTE(review): nothing visible in this file reads this list; presumably these
# are offline fixtures to parse instead of the live URLs -- confirm.
feed_file_list = [
    "./test_data/1.xml",
    "./test_data/geekpark.xml",
    "./test_data/huxiu.xml",
    "./test_data/ifanr.xml",
    "./test_data/iheima.xml",
    "./test_data/rss.xml",
    "./test_data/qqtech_rss.php",
]



def cutMsg(msg, power, seg_dict):
    """Extract keywords from ``msg`` and add their scaled weights to ``seg_dict``.

    Up to 20 keywords are requested from ``jieba.analyse.extract_tags`` together
    with their weights; each weight is multiplied by ``power`` before being
    accumulated through ``addSeg``.

    Args:
        msg: Text to analyse.
        power: Multiplier applied to every keyword weight from this text.
        seg_dict: Dict mapping keyword -> accumulated weight; updated in place.
    """
    weighted_tags = jieba.analyse.extract_tags(msg, topK=20, withWeight=True)
    for seg, weight in weighted_tags:
        addSeg(seg, weight * power, seg_dict)

def addSeg(seg, power, seg_dict):
    """Add ``power`` to the weight stored for ``seg`` in ``seg_dict``.

    A keyword that is not present yet is inserted with ``power`` as its
    initial weight. The dict is modified in place; nothing is returned.

    Args:
        seg: Keyword used as the dict key.
        power: Weight to add for this keyword.
        seg_dict: Dict mapping keyword -> accumulated weight.
    """
    # The ``in`` operator replaces dict.has_key(), which is deprecated in
    # Python 2 and no longer exists in Python 3.
    if seg in seg_dict:
        seg_dict[seg] = seg_dict[seg] + power
    else:
        seg_dict[seg] = power

def gethotword(feeds=None):
    """Build a keyword -> weight dict from the entries of several feeds.

    Every feed is parsed with ``feedparser.parse``. For each entry, keywords
    found in the title are accumulated with a multiplier of 10 and keywords
    found in the summary with a multiplier of 1.

    Args:
        feeds: Iterable of feed sources handed to ``feedparser.parse``.
            Defaults to the module-level ``feed_list``.

    Returns:
        Dict mapping each keyword to its accumulated weight.
    """
    if feeds is None:
        feeds = feed_list

    seg_dict = {}
    for feed in feeds:
        parsed = feedparser.parse(feed)
        for item in parsed["items"]:
            # Entries are not guaranteed to carry every element; a missing or
            # empty title/summary is skipped so that one incomplete entry does
            # not abort the whole aggregation with a KeyError.
            title = item.get('title')
            if title:
                cutMsg(title, 10, seg_dict)
            summary = item.get('summary')
            if summary:
                cutMsg(summary, 1, seg_dict)
    return seg_dict

def resize_power(list, max_value=20):
    """Linearly rescale the weights of (key, weight) pairs to ``[0, max_value]``.

    The smallest weight maps to 0 and the largest to ``max_value``; the order
    of the pairs is preserved.

    Args:
        list: Sequence of ``(key, weight)`` pairs. The parameter name shadows
            the builtin but is kept so existing keyword callers still work.
        max_value: Upper bound of the rescaled range.

    Returns:
        A new list of ``(key, rescaled_weight)`` tuples. An empty input yields
        an empty list. When every weight is identical (including the
        single-pair case) there is no range to scale over, so each pair
        receives ``max_value``.
    """
    pairs = list
    if not pairs:
        return []

    weights = [weight for _, weight in pairs]
    low = min(weights)
    high = max(weights)
    # float() forces true division even for integer weights on Python 2.
    span = float(high - low)
    if span == 0:
        # All weights equal: avoid dividing by zero.
        return [(key, float(max_value)) for key, _ in pairs]

    return [(key, (weight - low) / span * max_value) for key, weight in pairs]

def gethotwordlist(limit=20):
    """Return the highest-weighted keywords with their weights rescaled.

    The keyword weights collected by ``gethotword`` are sorted in descending
    order, cut to the first ``limit`` entries and passed through
    ``resize_power``.

    Args:
        limit: Maximum number of keywords to keep.

    Returns:
        List of ``(keyword, rescaled_weight)`` tuples, heaviest keyword first.
    """
    # sorted() accepts any iterable, so this works whether dict.items()
    # returns a list or a view; slicing already copes with short lists.
    ranked = sorted(gethotword().items(), key=lambda pair: pair[1], reverse=True)
    return resize_power(ranked[:limit])

if __name__ == "__main__":
    # Print one "keyword weight" pair per line, heaviest keyword first.
    for word, weight in gethotwordlist():
        # A single pre-formatted argument prints the same text whether
        # ``print`` is the Python 2 statement or the Python 3 function.
        print("%s %s" % (word, weight))

